#Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.5
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(randomForestSRC)
## 
##  randomForestSRC 3.2.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
library(ggplot2)

1.- One dimensional Partial Dependence Plot.

The partial dependence plot shows the marginal effect of a feature on the predicted outcome of a previously fit model.

EXERCISE:

Apply PDP to the regression example of predicting bike rentals. Fit a random forest approximation for the prediction of bike rentals (cnt). Use the partial dependence plot to visualize the relationships the model learned. Use the slides shown in class as model.

days=read.csv("day.csv")

days$dteday <- as_date(days$dteday)
days_since <- select(days, workingday, holiday, temp, hum, windspeed, cnt)
days_since$days_since_2011 <- int_length(interval(ymd("2011-01-01"), days$dteday)) / (3600*24)
days_since$SUMMER <- ifelse(days$season == 3, 1, 0)
days_since$FALL <- ifelse(days$season == 4, 1, 0)
days_since$WINTER <- ifelse(days$season == 1, 1, 0)
days_since$MISTY <- ifelse(days$weathersit == 2, 1, 0)
days_since$RAIN <- ifelse(days$weathersit == 3 | days$weathersit == 4, 1, 0)
days_since$temp <- days_since$temp * 47 - 8
days_since$hum <- days_since$hum * 100
days_since$windspeed <- days_since$windspeed * 67

rf <- rfsrc(cnt~., data=days_since)
results <- select(days_since, days_since_2011, temp, hum, windspeed, cnt)
nr <- nrow(days_since)
for(c in names(results)[1:4])
{
  for(i in 1:nr){
    r <- days_since
    r[[c]] <- days_since[[c]][i]
    sal <- predict(rf, r)$predicted
    results[[c]][i] <- sum(sal) / nr
  }
}

p1 <- ggplot(days_since, aes(x=days_since_2011, y = results$days_since_2011)) + geom_line() + geom_rug(alpha=0.1, sides="b") + ylim(0, 6000) + xlab("Days since 2011") + ylab("Prediction")
p2 <- ggplot(days_since, aes(x=temp, y = results$temp)) + geom_line() + geom_rug(alpha=0.1, sides="b") + ylim(0, 6000) + xlab("Temperature") + ylab(NULL)
p3 <- ggplot(days_since, aes(x=hum , y = results$hum)) + geom_line() + geom_rug(alpha=0.1, sides="b") + ylim(0, 6000) + xlab("Humidity") + ylab(NULL)
p4 <- ggplot(days_since, aes(x=windspeed, y = results$windspeed)) + geom_line() + geom_rug(alpha=0.1, sides="b") + ylim(0, 6000) + xlab("Wind speed") + ylab(NULL)

subplot(p1,p2,p3,p4, shareY = T, titleY=T, titleX=T)
02004006000200040006000010203002550751000102030
Days since 2011TemperatureHumidityWind speedPrediction

QUESTION:

Analyse the influence of days since 2011, temperature, humidity and wind speed on the predicted bike counts.

days_since_2011: There is a positive correlation between the prediction of rented bicycles and the number of days since 2011. However, there is not a strong positive correlation, as the number of rented bicycles slightly decreases at certain points.

temp: The number of rented bicycles increases with temperature, but once it surpasses 25 degrees, the opposite occurs, and it begins to decrease. This makes sense, as typically, people do not feel like riding bicycles when it is either too cold or too hot.

windspeed: If the wind speed is below 25, there is a negative correlation because if there is too much wind, it is uncomfortable to ride a bicycle.We cannot say much about wind speed above 25 because there are not enough samples.

hum: We cannot say much about humidity below 50 because there are not enough samples. However, if the humidity is above 50, there is a negative correlation because as humidity increases, it becomes more uncomfortable to ride a bicycle.

2.- Bidimensional Partial Dependency Plot.

EXERCISE:

Generate a 2D Partial Dependency Plot with humidity and temperature to predict the number of bikes rented depending on those parameters.

BE CAREFUL: due to the size, extract a set of random samples from the BBDD before generating the data for the Partial Dependency Plot.

Show the density distribution of both input features with the 2D plot as shown in the class slides.

TIP: Use geom_tile() to generate the 2D plot. Set width and height to avoid holes.

sampled <- sample_n(days_since, 40)
temp <- sampled$temp
hum <- sampled$hum
th <- inner_join(data.frame(temp),data.frame(hum), by=character())
th$p <- 0
for(i in 1:nrow(th)){
  r <- days_since
  r[["temp"]] <- th[["temp"]][i]
  r[["hum"]] <- th[["hum"]][i]
  
  sal <- predict(rf, r)$predicted
  th[["p"]][i] <- sum(sal) / nr
}
ggplot(th, mapping=aes(temp, hum)) + geom_tile(mapping=aes(fill= p, width =20, height=20)) + geom_rug()+xlab("Temperature") + ylab("Humidity") + labs(fill = "Number of bikes")

QUESTION:

Interpret the results.

At a constant temperature, an increase in humidity decreases the number of bicycles rented. On the other hand, for a constant humidity, if the temperature increases, the number of bicycles rented also increases, except when the temperature exceeds 25 degrees and when it is below 10 degrees, which decreases due to extreme temperatures.

3.- PDP to explain the price of a house.

EXERCISE:

Apply the previous concepts to predict the price of a house from the database kc_house_data.csv. In this case, use again a random forest approximation for the prediction based on the features bedrooms, bathrooms, sqft_living, sqft_lot, floors and yr_built.

Use the partial dependence plot to visualize the relationships the model learned.

BE CAREFUL: due to the size, extract a set of random samples from the BBDD before generating the data for the Partial Dependency Plot.

d <- read.csv("kc_house_data.csv")
sampled <- sample_n(d, 1000)
sampled <- select(sampled, bedrooms, bathrooms, sqft_living, sqft_lot, floors, yr_built, price)
rf <- rfsrc(price~., data=sampled)
results <- select(sampled, bedrooms, bathrooms, sqft_living, floors, price)
nr <- nrow(sampled)
for(c in names(results)[1:4]){
  for(i in 1:nr){
    r <- sampled
    r[[c]] <- sampled[[c]][i]
    sal <- predict(rf, r)$predicted
    results[[c]][i] <- sum(sal) / nr
    }
}
p5 <- ggplot(data = sampled, aes(x=bedrooms, y=results$bedrooms)) + geom_line() +
  geom_rug(alpha=0.1, sides="b")+ labs(x="Bedrooms", y="Prediction") 
p6 <- ggplot(data = sampled, aes(x=bathrooms, y=results$bathrooms)) + geom_line() +
  geom_rug(alpha=0.1, sides="b") + labs(x="Bathrooms", y=NULL)+ xlim(0,5)
p7 <- ggplot(sampled, aes(x=sqft_living , y = results$sqft_living)) + geom_line() + geom_rug(alpha=0.1, sides="b") + labs(x="Sqft_living", y=NULL)
p8 <- ggplot(data = sampled, aes(x=floors, y=results$floors)) + geom_line() +
  geom_rug(alpha=0.1, sides="b") + labs(x="Floors", y=NULL) 
subplot(p5,p6,p7,p8, titleY=T, titleX=T)
24685300005400005500005600000123456e+058e+051e+062500500075001000060000090000012000001.01.52.02.53.0540000560000580000600000620000
BedroomsBathroomsSqft_livingFloorsPrediction

QUESTION:

Analyse the influence of bedrooms, bathrooms, sqft_living and floors on the predicted price.

bedrooms: The estimated price increases when going from one to two rooms, which makes sense. However, as the number of rooms increases from two to five, the estimated price surprisingly decreases, and it increases significantly when going from 5 to 6 rooms. This pattern is strange and doesn’t seem to make much sense.

bathrooms: The estimated price increases as the number of bathrooms increases, which makes sense.

sqft_living: The price increases as the square footage increases, which makes sense, since generally, the larger the space, the higher the price.

floors: Following a similar pattern to that of square footage, but with a smaller price difference, as the number of floors increases, the estimated price also increases.